library("sleuth")
## Loading required package: ggplot2
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
base_dir <- "../results"

Load subsamples

all_ss_directory <- Sys.glob("../results/ss*")

# sanity check
temp <- sapply(all_ss_directory, function(x) {
  strsplit(x, '/')[[1]][3]
}, USE.NAMES = FALSE)
all.equal(temp, all_ss_id$sample)
## [1] "35 string mismatches"
# all_ss_id <- mutate(all_ss_id, path = all_ss_directory)

so <- sleuth_prep(all_ss_directory, all_ss_id, ~1)
## reading in kallisto results
## ........................................
## 
## normalizing est_counts
## 81208 targets passed the filter
## normalizing tpm
## merging in metadata
## normalizing bootstrap samples
# so <- sleuth_prep(all_ss_id, ~1)
ss_summary <- so$obs_norm %>%
  group_by(target_id) %>%
  summarise(
    mean_tpm = mean(tpm),
    sd_tpm = sd(tpm),
    var_tpm = var(tpm),
    cv_tpm = sd_tpm / mean_tpm,
    mean_est_counts = mean(est_counts),
    sd_est_counts = sd(est_counts),
    var_est_counts = var(est_counts),
    cv_est_counts = sd_est_counts / mean_est_counts
    )
ss_summary
## Source: local data frame [198,457 x 9]
## 
##          target_id    mean_tpm     sd_tpm      var_tpm     cv_tpm
##              (chr)       (dbl)      (dbl)        (dbl)      (dbl)
## 1  ENST00000000233 107.7728992 3.42522541 11.732169140 0.03178188
## 2  ENST00000000412  55.5685781 2.58297827  6.671776760 0.04648271
## 3  ENST00000000442  20.0162404 1.64091801  2.692611903 0.08197933
## 4  ENST00000001008 112.2844862 1.21094225  1.466381144 0.01078459
## 5  ENST00000001146   0.3376546 0.20809803  0.043304792 0.61630450
## 6  ENST00000002125   7.7191243 0.67581750  0.456729298 0.08755106
## 7  ENST00000002165  63.3896571 1.16449783  1.356055197 0.01837047
## 8  ENST00000002501  25.2559906 1.02167625  1.043822358 0.04045283
## 9  ENST00000002596   0.1693633 0.02921479  0.000853504 0.17249778
## 10 ENST00000002829   3.0708259 0.63745593  0.406350064 0.20758452
## ..             ...         ...        ...          ...        ...
## Variables not shown: mean_est_counts (dbl), sd_est_counts (dbl),
##   var_est_counts (dbl), cv_est_counts (dbl)

aggregate by genes

mart <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL",
  dataset = "hsapiens_gene_ensembl", host="sep2015.archive.ensembl.org")
## Creating a generic function for 'nchar' from package 'base' in package 'S4Vectors'
t2g <- biomaRt::getBM(
  attributes = c("ensembl_transcript_id", "ensembl_gene_id",
    "external_gene_name"), mart = mart)
t2g <- dplyr::rename(t2g,
  target_id = ensembl_transcript_id, ens_gene = ensembl_gene_id)
gene_subsample_counts <- so$obs_norm %>%
  left_join(t2g, by = "target_id") %>%
  group_by(ens_gene, sample) %>%
  summarize(sum = sum(est_counts)) %>%
  summarize(mean_est_counts = mean(sum),
    var_est_counts = var(sum),
    sd_est_counts = sd(sum))

Load bootstrap

## Found 40 bootstrap samples
collapse_genes <- function(data, mapping) {
  data <- left_join(data, mapping, by = 'target_id')
  data %>%
    group_by(ens_gene) %>%
    summarize(
      est_counts = sum(est_counts),
      tpm = sum(tpm)) %>%
    rename(target_id = ens_gene)
}
bs_kal_gene <- bs_kal
bs_kal_gene$bootstrap <- lapply(bs_kal_gene$bootstrap,
  function(y) {
    collapse_genes(y, t2g)
    })
bs_kal_gene$abundance <- collapse_genes(bs_kal_gene$abundance, t2g)
bs_gene_summary <- sleuth:::summarize_bootstrap(bs_kal_gene, "est_counts")

Variance estimation

Variance estimation for other methods

ss_summary

## debugging in: compute_correlation(rsem_df, ss_summary, c(transcript_id = "target_id"))
## debug at <text>#16: {
##     tmp <- inner_join(df, the_summary, by = which_id)
##     tmp <- filter(tmp, !is.na(count_var) & is.finite(var_est_counts) & 
##         is.finite(count_var))
##     tmp <- mutate(tmp, count_var = count_var + est_counts)
##     with(tmp, cor(var_est_counts, count_var))
## }
## debug at <text>#17: tmp <- inner_join(df, the_summary, by = which_id)
## debug at <text>#18: tmp <- filter(tmp, !is.na(count_var) & is.finite(var_est_counts) & 
##     is.finite(count_var))
## debug at <text>#19: tmp <- mutate(tmp, count_var = count_var + est_counts)
## debug at <text>#21: with(tmp, cor(var_est_counts, count_var))

Mean-variance relationship

Subsampled:

## Warning: Removed 9562 rows containing missing values (geom_point).

## Warning: Removed 9562 rows containing missing values (geom_point).

Bootstrapped:

## Warning: Removed 7250 rows containing missing values (geom_point).

## Warning: Removed 7250 rows containing missing values (geom_point).

Gene Mean Variance Relationship

Subsampled:

## Warning: Removed 2209 rows containing missing values (geom_point).

## Warning: Removed 2209 rows containing missing values (geom_point).

bootstrapped:

## Warning: Removed 711 rows containing missing values (geom_point).

## Warning: Removed 711 rows containing missing values (geom_point).
sessionInfo()
## R version 3.2.3 (2015-12-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.3 LTS
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] utils   stats   methods base   
## 
## other attached packages:
## [1] sleuth_0.27.3 dplyr_0.4.3   ggplot2_2.0.0 knitr_1.12.3 
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.3          GenomeInfoDb_1.4.3   formatR_1.2.1       
##  [4] plyr_1.8.3           bitops_1.0-6         tools_3.2.3         
##  [7] grDevices_3.2.3      zlibbioc_1.14.0      biomaRt_2.24.1      
## [10] digest_0.6.9         evaluate_0.8         RSQLite_1.0.0       
## [13] gtable_0.1.2         rhdf5_2.12.0         DBI_0.3.1           
## [16] yaml_2.1.13          parallel_3.2.3       stringr_1.0.0       
## [19] IRanges_2.2.9        S4Vectors_0.6.6      graphics_3.2.3      
## [22] datasets_3.2.3       stats4_3.2.3         grid_3.2.3          
## [25] data.table_1.9.6     Biobase_2.28.0       R6_2.1.2            
## [28] AnnotationDbi_1.30.1 XML_3.98-1.3         rmarkdown_0.9.2     
## [31] tidyr_0.4.1          magrittr_1.5         scales_0.3.0        
## [34] htmltools_0.3        BiocGenerics_0.14.0  assertthat_0.1      
## [37] colorspace_1.2-6     labeling_0.3         stringi_1.0-1       
## [40] RCurl_1.95-4.7       lazyeval_0.1.10      munsell_0.4.2       
## [43] chron_2.3-47